# Question: What variables contribute to making the best quality Red Wine?
# fetch the data
# Directory holding the data file. It defaults to the original project folder;
# set the RWINE_DATA_DIR environment variable to read the CSV from elsewhere.
# The working directory is deliberately left untouched (no setwd()), so running
# this script does not change global state for whatever runs after it.
rwine_data_dir <- Sys.getenv('RWINE_DATA_DIR',
                             unset = '/Users/bryonymiles/documents/udacityprojects')
rwine <- read.csv(file.path(rwine_data_dir, 'wineQualityReds.csv'))
#Initial Analysis ----------------------------------------------
#First look at the raw data frame: its size, column names, column types and
#per-column summary statistics. The "##" lines appear to be the console output
#recorded when the script was last run.
#dimensions?
dim(rwine)
## [1] 1599 13
#variables?
names(rwine)
## [1] "X" "fixed.acidity" "volatile.acidity"
## [4] "citric.acid" "residual.sugar" "chlorides"
## [7] "free.sulfur.dioxide" "total.sulfur.dioxide" "density"
## [10] "pH" "sulphates" "alcohol"
## [13] "quality"
#Field Descriptions:
#1 - fixed acidity: most wine acids involved are fixed or nonvolatile (do not evaporate readily)
#2 - volatile acidity: amount of acetic acid in wine - can be unpleasant, vinegary taste if too high?
#3 - citric acid: found in small quantities, can add 'freshness' and flavor to wines
#4 - residual sugar: sugar remaining after fermentation stops, rare < 1 gram/liter, > 45 grams/liter are considered sweet
#5 - chlorides: amount of salt in the wine
#6 - free sulfur dioxide: the free form of SO2 - prevents microbial growth and the oxidation of wine
#7 - total sulfur dioxide: free + bound forms of SO2; in low concentrations, mostly undetectable in wine, free SO2 over 50 ppm, evident in the nose and taste of wine
#8 - density: the density of wine is close to that of water (approx 1) depending on the percent alcohol and sugar content
#9 - pH: acidic on a scale from 0 (very acidic) to 14 (very basic); most wines are between 3-4 on the pH scale
#10 - sulphates: an additive which can contribute to SO2 levels, acts as an antimicrobial and antioxidant
#11 - alcohol: the percent alcohol content of the wine
#12 - quality (Output Variable) - sensory score between 0 and 10
#structure?
#str() lists each column with its type and first few values
str(rwine)
## 'data.frame': 1599 obs. of 13 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
## $ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : int 5 5 5 6 5 5 5 7 7 5 ...
#summary() gives min / quartiles / mean / max for every column
summary(rwine)
## X fixed.acidity volatile.acidity citric.acid
## Min. : 1.0 Min. : 4.60 Min. :0.1200 Min. :0.000
## 1st Qu.: 400.5 1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090
## Median : 800.0 Median : 7.90 Median :0.5200 Median :0.260
## Mean : 800.0 Mean : 8.32 Mean :0.5278 Mean :0.271
## 3rd Qu.:1199.5 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420
## Max. :1599.0 Max. :15.90 Max. :1.5800 Max. :1.000
## residual.sugar chlorides free.sulfur.dioxide
## Min. : 0.900 Min. :0.01200 Min. : 1.00
## 1st Qu.: 1.900 1st Qu.:0.07000 1st Qu.: 7.00
## Median : 2.200 Median :0.07900 Median :14.00
## Mean : 2.539 Mean :0.08747 Mean :15.87
## 3rd Qu.: 2.600 3rd Qu.:0.09000 3rd Qu.:21.00
## Max. :15.500 Max. :0.61100 Max. :72.00
## total.sulfur.dioxide density pH sulphates
## Min. : 6.00 Min. :0.9901 Min. :2.740 Min. :0.3300
## 1st Qu.: 22.00 1st Qu.:0.9956 1st Qu.:3.210 1st Qu.:0.5500
## Median : 38.00 Median :0.9968 Median :3.310 Median :0.6200
## Mean : 46.47 Mean :0.9967 Mean :3.311 Mean :0.6581
## 3rd Qu.: 62.00 3rd Qu.:0.9978 3rd Qu.:3.400 3rd Qu.:0.7300
## Max. :289.00 Max. :1.0037 Max. :4.010 Max. :2.0000
## alcohol quality
## Min. : 8.40 Min. :3.000
## 1st Qu.: 9.50 1st Qu.:5.000
## Median :10.20 Median :6.000
## Mean :10.42 Mean :5.636
## 3rd Qu.:11.10 3rd Qu.:6.000
## Max. :14.90 Max. :8.000
#Thoughts at this stage - Quality range is between 3 and 8 - does this correlate with anything?
#Mean/Median seem to be relatively close on all variables except total.sulfur.dioxide and chlorides - long tailed?
#GGPairs ----------------------------------------------
library('GGally')
#Pairs plot of every measured variable against every other one.
#X is only the row index (it runs 1..1599), so it is left out: correlating a
#row number with the chemistry adds a row and column of meaningless panels.
ggpairs(rwine[, names(rwine) != 'X', drop = FALSE])

#A rendered copy of the pairs plot is attached as "RedwineGGPlot.pdf"
#NOTE(review): an earlier version of this comment said the call above was commented out, but it is live.
#Pearson's R results are colour coded - yellow(0.3-0.5 small meaning), orange(0.5-0.7 moderate), red(0.7+ pretty large)
#Thoughts at this stage:
#Normally distributed data: density, ph
#Positively Skewed: fixed.acidity, volatile.acidity,citric acid, residual sugar, free & total sulphur dioxide, sulphates, alcohol
#Correlations? I highlighted anything with an R-score over or around 0.5
#fixed & volatile acidity
#fixed acidity & density
#fixed acidity & pH
#citric acid & pH
#density & alcohol
#quality & alcohol
#GGPairs next step: trying to normalise the skewed data ----------------------------------------------
library(ggplot2)
library(gridExtra)

#Build the three histograms used to judge the skew of one column:
#untransformed, log10 x-axis and sqrt x-axis.
#  data   - data frame holding the column
#  column - name of the column to plot (string)
#  label  - readable variable name, used as the prefix of each x-axis label
#  fill   - fill colour of the bars
#Returns a list of three ggplot objects in the order raw, log10, sqrt.
skew_histograms <- function(data, column, label, fill) {
  #Copy the column into a one-column frame so the aesthetic can be written
  #without string-based aes helpers.
  base_hist <- ggplot(data.frame(value = data[[column]]), aes(x = value)) +
    geom_histogram(fill = fill)
  list(
    base_hist + xlab(paste(label, 'Histogram')),
    base_hist + scale_x_log10() + xlab(paste(label, 'Log 10 Histogram')),
    base_hist + scale_x_sqrt() + xlab(paste(label, 'Sqrt Histogram'))
  )
}

#One row per variable to inspect, in the order the plots appear in the grid.
hist_specs <- data.frame(
  column = c('fixed.acidity', 'volatile.acidity', 'citric.acid',
             'residual.sugar', 'total.sulfur.dioxide', 'free.sulfur.dioxide',
             'sulphates', 'alcohol', 'chlorides'),
  label = c('Fixed Acidity', 'Volatile Acidity', 'Citric Acid',
            'Residual Sugar', 'Total SO2', 'Free SO2',
            'Sulphates', 'Alcohol', 'Chlorides'),
  fill = c('blue', 'orange', 'purple', 'red', 'pink', 'grey',
           'green', 'yellow', 'black'),
  stringsAsFactors = FALSE
)

#Flatten the per-variable triples into a single list of 27 plots.
#The plots are no longer stored in separate p1..p27 variables.
hist_plots <- do.call(c, lapply(seq_len(nrow(hist_specs)), function(i) {
  skew_histograms(rwine, hist_specs$column[i], hist_specs$label[i],
                  hist_specs$fill[i])
}))
do.call(grid.arrange, hist_plots)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 132 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Resulting decisions made to transform the data as follows
#1 - fixed acidity: Log 10 Transformation
#2 - volatile acidity: Log 10 Transformation
#3 - citric acid: No change - 132 0 values, transformations therefore not suitable.
#4 - residual sugar - Log 10 - this is long tailed but results could be significant
#5 - chlorides: Log 10 transformation
#6 - free sulfur dioxide: Sqrt transformation - normalises better than log 10
#7 - total sulfur dioxide: Log 10 Transformation - again long tailed but results may be significant
#8 - density: no change - normal already
#9 - pH: no change - normal already
#10 - sulphates: log10 transformation
#11 - alcohol: no change - transformations have no significant impact
#12 - quality (Output Variable) - sensory score between 0 and 10
#Assemble the transformed copy of rwine, one column per line.
#Every argument is still written as an expression on rwine$<column>, so the
#column names that data.frame() derives from the expressions are unchanged.
newrwine <- data.frame(
  rwine$X,
  log10(rwine$fixed.acidity),
  log10(rwine$volatile.acidity),
  rwine$citric.acid,                  #untransformed
  log10(rwine$residual.sugar),
  log10(rwine$chlorides),
  sqrt(rwine$free.sulfur.dioxide),    #the only sqrt transform
  log10(rwine$total.sulfur.dioxide),
  rwine$density,                      #untransformed
  rwine$pH,                           #untransformed
  log10(rwine$sulphates),
  rwine$alcohol,                      #untransformed
  rwine$quality
)
str(object = newrwine)
## 'data.frame': 1599 obs. of 13 variables:
## $ rwine.X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ log10.rwine.fixed.acidity. : num 0.869 0.892 0.892 1.049 0.869 ...
## $ log10.rwine.volatile.acidity. : num -0.1549 -0.0555 -0.1192 -0.5528 -0.1549 ...
## $ rwine.citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ log10.rwine.residual.sugar. : num 0.279 0.415 0.362 0.279 0.279 ...
## $ log10.rwine.chlorides. : num -1.12 -1.01 -1.04 -1.12 -1.12 ...
## $ sqrt.rwine.free.sulfur.dioxide. : num 3.32 5 3.87 4.12 3.32 ...
## $ log10.rwine.total.sulfur.dioxide.: num 1.53 1.83 1.73 1.78 1.53 ...
## $ rwine.density : num 0.998 0.997 0.997 0.998 0.998 ...
## $ rwine.pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ log10.rwine.sulphates. : num -0.252 -0.167 -0.187 -0.237 -0.252 ...
## $ rwine.alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ rwine.quality : int 5 5 5 6 5 5 5 7 7 5 ...
#New ggpairs - stored in Redwine_transformedGGPlot.pdf
#rwine.X is only the row index, so it is left out of the pairs plot; it would
#otherwise add a row and column of meaningless panels.
ggpairs(newrwine[, names(newrwine) != 'rwine.X', drop = FALSE])

#Pearson's R results are colour coded - yellow(0.3-0.5 small meaning), orange(0.5-0.7 moderate), red(0.7+ pretty large)
#What does this mean though? I ended up plotting it on a flow diagram using draw.io - see Red Wine Correlations.png Colour coding is the same
#GGPairs Conclusions ----------------------------------------------
#1. Although Free and Total SO2 are highly correlated, they do not correlate with any other variables so I am not going to analyse them further
#2. Residual sugar seems to have a small correlation to density but nothing else. I won't analyse that further either.
#3. Start off with 3 critical factors - Alcohol, Volatile Acid and Sulphates - facet wraps on quality...
#4. Link these graphs to related variables and see if I can see any further patterns.
#Quick look at Quality ----------------------------------------------
#quality is an integer score with only six distinct values in this data, so
#draw one bar per score instead of letting stat_bin() spread the values over
#30 arbitrary bins (this also avoids the `bins = 30` message).
ggplot(rwine, aes(x = quality)) +
  geom_bar()

#The majority of the data has quality level 5, 6 or 7
#Critical Factor Analysis ----------------------------------------------
#1. ALCOHOL
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:GGally':
##
## nasa
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#Group the wines by quality score once; q_groups is reused by the volatile
#acidity and sulphates summaries further down.
q_groups <- rwine %>% group_by(quality)
#Mean and median alcohol content, plus the number of wines, per quality score.
rwine.alc_by_q <- q_groups %>%
  summarise(alcohol_mean = mean(alcohol),
            alcohol_median = median(alcohol),
            n = n())
rwine.alc_by_q
## Source: local data frame [6 x 4]
##
## quality alcohol_mean alcohol_median n
## (int) (dbl) (dbl) (int)
## 1 3 9.955000 9.925 10
## 2 4 10.265094 10.000 53
## 3 5 9.899706 9.700 681
## 4 6 10.629519 10.500 638
## 5 7 11.465913 11.500 199
## 6 8 12.094444 12.150 18
#Share of wines scoring 5, 6 or 7, computed from the data itself rather than
#from counts copied by hand out of the summary table.
mean(rwine$quality %in% 5:7)
## [1] 0.9493433
#95% of the data is in the quality bracket 5-7
#Alcohol histograms, one panel per quality score
ggplot(rwine, aes(x = alcohol)) +
  geom_histogram(binwidth = 0.5) +
  facet_grid(~quality, scales = "free")

#Alcohol - the alcohol content goes up with the quality. Nothing <10 for top, nothing over 12 for bottom.
#Majority of mid range wines (5 & 6) = 10
#lets look in more detail..
#Alcohol readings per quality score (points jittered horizontally only),
#overlaid with the per-score mean (dashed red line) and the 10th and 90th
#percentiles (default line style).
ggplot(data = rwine, mapping = aes(x = quality, y = alcohol)) +
  geom_point(position = position_jitter(height = 0), alpha = 1/5) +
  geom_line(stat = 'summary', fun.y = mean, color = 'red', linetype = 2) +
  geom_line(stat = 'summary', fun.y = quantile,
            fun.args = list(probs = .1)) +
  geom_line(stat = 'summary', fun.y = quantile,
            fun.args = list(probs = .9))

#once the quality gets to 5 there is a clear positive trend
#What about Density?, Fixed Acidity and pH - how does that affect the alcohol level. what makes it high?
#Density against alcohol, coloured by quality, with a linear fit
ggplot(data = rwine,
       mapping = aes(x = alcohol, y = density, color = quality)) +
  geom_point(position = position_jitter(height = 0), alpha = 1/5) +
  scale_colour_gradient(low = I('yellow'), high = I('purple'),
                        na.value = "grey50", guide = "colourbar") +
  stat_smooth(method = 'lm')

#There seems to be a negative linear correlation between alcohol and density - the content is higher when the density is lower
#How about fixed acidity and density?
ggplot(data = rwine,
       mapping = aes(x = density, y = fixed.acidity, color = quality)) +
  geom_point(position = position_jitter(height = 0), alpha = 1/5) +
  scale_colour_gradient(low = I('yellow'), high = I('purple'),
                        na.value = "grey50", guide = "colourbar") +
  stat_smooth(method = 'lm')

#There seems to be a positive linear correlation here. The higher the density, the higher the fixed acidity.
#how about pH
ggplot(data = rwine,
       mapping = aes(x = fixed.acidity, y = pH, color = quality)) +
  geom_point(position = position_jitter(height = 0), alpha = 1/5) +
  scale_colour_gradient(low = I('yellow'), high = I('purple'),
                        na.value = "grey50", guide = "colourbar") +
  stat_smooth(method = 'lm')

#strong negative trend here...
#Density against fixed acidity (x restricted to 7-15), coloured by pH this time
ggplot(data = rwine,
       mapping = aes(x = fixed.acidity, y = density, color = pH)) +
  geom_point(position = position_jitter(height = 0), alpha = 1/5) +
  xlim(7, 15) +
  scale_colour_gradient(low = I('blue'), high = I('red'),
                        na.value = "grey50", guide = "colourbar") +
  stat_smooth(method = 'lm')
## Warning: Removed 317 rows containing non-finite values (stat_smooth).
## Warning: Removed 343 rows containing missing values (geom_point).

#But does pH figure? Tricky to tell. The Ph levels do seem higher at the 'positive' end?
#Possible hypothesis: low density levels + low fixed.acidity + higher ph levels (over 3?) = higher alcohol content = better quality wine?
#2. VOLATILE ACIDITY
#Mean and median volatile acidity, plus the number of wines, per quality score
#(q_groups is rwine grouped by quality, built in the alcohol section).
rwine.va_by_q <- q_groups %>%
  summarise(vol.acid_mean = mean(volatile.acidity),
            vol.acid_median = median(volatile.acidity),
            n = n())
rwine.va_by_q
## Source: local data frame [6 x 4]
##
## quality vol.acid_mean vol.acid_median n
## (int) (dbl) (dbl) (int)
## 1 3 0.8845000 0.845 10
## 2 4 0.6939623 0.670 53
## 3 5 0.5770411 0.580 681
## 4 6 0.4974843 0.490 638
## 5 7 0.4039196 0.370 199
## 6 8 0.4233333 0.370 18
#looks like a gradual descent in acidity from low to high quality
#mean and median quite similar
#Volatile acidity histograms, one panel per quality score
ggplot(data = rwine, mapping = aes(x = volatile.acidity)) +
  geom_histogram(binwidth = 0.05) +
  facet_wrap(~quality)

#not much more to glean from the histograms really. Distributions seem pretty similar on first glance.
#Volatile acidity per quality score (horizontal jitter only) with a linear fit
ggplot(data = rwine, mapping = aes(x = quality, y = volatile.acidity)) +
  geom_point(position = position_jitter(height = 0), alpha = 1/5) +
  stat_smooth(method = 'lm')

#this plot does reflect the downward trend but there is a lot of noise
#smoother may be misleading? let's knock off 1% (outliers) and have a look at mean and quantiles
#also let's check if it's worth adding log10 or sqrt transformations.
#Layers shared by all three variants: jittered points plus the per-score mean
#(dashed red) and the 10th / 90th percentile lines.
va_by_quality <- ggplot(data = rwine,
                        mapping = aes(x = quality, y = volatile.acidity)) +
  geom_point(position = position_jitter(height = 0), alpha = 1/5) +
  geom_line(stat = 'summary', fun.y = mean, color = 'red', linetype = 2) +
  geom_line(stat = 'summary', fun.y = quantile,
            fun.args = list(probs = .1)) +
  geom_line(stat = 'summary', fun.y = quantile,
            fun.args = list(probs = .9))
#p1: y-axis capped at the 99th percentile of volatile acidity
p1 <- va_by_quality + ylim(0, quantile(rwine$volatile.acidity, 0.99))
#p2: square-root y coordinate
p2 <- va_by_quality + coord_trans(y = 'sqrt')
#p3: log10 y coordinate
p3 <- va_by_quality + coord_trans(y = 'log10')
grid.arrange(p1, p2, p3)
## Warning: Removed 15 rows containing non-finite values (stat_summary).
## Warning: Removed 15 rows containing non-finite values (stat_summary).
## Warning: Removed 15 rows containing non-finite values (stat_summary).
## Warning: Removed 15 rows containing missing values (geom_point).

#no to transformation. slight downward trend still visible.
#Where's the correlation between Volatile and Fixed Acidity?
#Volatile against fixed acidity, all wines
ggplot(data = rwine,
       mapping = aes(x = fixed.acidity, y = volatile.acidity)) +
  geom_point(position = position_jitter(height = 0), alpha = 1/5)

#seems to be a bit of a cluster - the majority of results have low fixed and volatile acidity
#what if we knock out the outliers?
#Same plot with both axes capped at their 99th percentile, coloured by quality
ggplot(data = rwine,
       mapping = aes(x = fixed.acidity, y = volatile.acidity,
                     color = quality)) +
  geom_point(position = position_jitter(height = 0), alpha = 1/5) +
  xlim(0, quantile(rwine$fixed.acidity, 0.99)) +
  ylim(0, quantile(rwine$volatile.acidity, 0.99)) +
  scale_colour_gradient(low = I('yellow'), high = I('purple'),
                        na.value = "grey50", guide = "colourbar")
## Warning: Removed 30 rows containing missing values (geom_point).

#it looks like fixed acidity and volatile acidity are related and pretty constant - as you'd expect
#there seems to be no particular correlation with quality so perhaps a dead end.
#Possible hypothesis - Volatile Acidity may have a small impact on quality - the lower the acidity, the better the wine
#this may or may not be significant.
#3. SULPHATES
#Mean and median sulphates, plus the number of wines, per quality score
#(q_groups is rwine grouped by quality, built in the alcohol section).
rwine.sul_by_q <- q_groups %>%
  summarise(sulph_mean = mean(sulphates),
            sulph_median = median(sulphates),
            n = n())
rwine.sul_by_q
## Source: local data frame [6 x 4]
##
## quality sulph_mean sulph_median n
## (int) (dbl) (dbl) (int)
## 1 3 0.5700000 0.545 10
## 2 4 0.5964151 0.560 53
## 3 5 0.6209692 0.580 681
## 4 6 0.6753292 0.640 638
## 5 7 0.7412563 0.740 199
## 6 8 0.7677778 0.740 18
#looks like a pretty clear positive trend in both mean and median
#Sulphates histograms, one panel per quality score
ggplot(data = rwine, mapping = aes(x = sulphates)) +
  geom_histogram(binwidth = 0.2) +
  facet_grid(~quality)

#histograms back this up..
#Sulphates per quality score (horizontal jitter only) with a linear fit
ggplot(data = rwine, mapping = aes(x = quality, y = sulphates)) +
  geom_point(position = position_jitter(height = 0), alpha = 1/5) +
  stat_smooth(method = 'lm')

#what about mean and quantiles and knocking off 1%
#Mean (dashed red) and 10th / 90th percentile lines, with the y-axis capped at
#the 99th percentile of sulphates
ggplot(data = rwine, mapping = aes(x = quality, y = sulphates)) +
  geom_point(position = position_jitter(height = 0), alpha = 1/5) +
  geom_line(stat = 'summary', fun.y = mean, color = 'red', linetype = 2) +
  geom_line(stat = 'summary', fun.y = quantile,
            fun.args = list(probs = .1)) +
  geom_line(stat = 'summary', fun.y = quantile,
            fun.args = list(probs = .9)) +
  ylim(0, quantile(rwine$sulphates, 0.99))
## Warning: Removed 16 rows containing non-finite values (stat_summary).
## Warning: Removed 16 rows containing non-finite values (stat_summary).
## Warning: Removed 16 rows containing non-finite values (stat_summary).
## Warning: Removed 16 rows containing missing values (geom_point).

#slight positive trend still visible..
#not trying transformations as the original data was normal
#What about Citric Acid?
#Sulphates against citric acid, coloured by quality, with a linear fit
ggplot(data = rwine,
       mapping = aes(x = citric.acid, y = sulphates, color = quality)) +
  geom_point(position = position_jitter(height = 0), alpha = 1/5) +
  stat_smooth(method = 'lm') +
  scale_colour_gradient(low = I('yellow'), high = I('purple'),
                        na.value = "grey50", guide = "colourbar")

#no very clear trends there... pretty constant and quality well distributed
#or Chlorides?
ggplot(data = rwine,
       mapping = aes(x = chlorides, y = sulphates, color = quality)) +
  geom_point(position = position_jitter(height = 0), alpha = 1/5) +
  stat_smooth(method = 'lm') +
  scale_colour_gradient(low = I('yellow'), high = I('purple'),
                        na.value = "grey50", guide = "colourbar")

#this looks interesting. Lets take out some outliers...
#Same plot with both axes capped at their 95th percentile
ggplot(data = rwine,
       mapping = aes(x = chlorides, y = sulphates, color = quality)) +
  geom_point(position = position_jitter(height = 0), alpha = 1/5) +
  stat_smooth(method = 'lm') +
  scale_colour_gradient(low = I('yellow'), high = I('purple'),
                        na.value = "grey50", guide = "colourbar") +
  ylim(0, quantile(rwine$sulphates, 0.95)) +
  xlim(0, quantile(rwine$chlorides, 0.95))
## Warning: Removed 131 rows containing non-finite values (stat_smooth).
## Warning: Removed 132 rows containing missing values (geom_point).

#this backs up the theory that higher sulphate levels = better quality wine
#like volatile and fixed acidity there is a pretty constant and logical cluster between chlorides and sulphates but no further statistical information can be gleaned.
#Possible hypothesis - Sulphates may have a small impact on quality - the higher the sulphate level, the better the wine
#Conclusions: What are my final three plots?
#What do they need to do:
#* Draw comparisons.
#* Identify trends.
#* Engage a wide audience.
#* Explain a complicated finding.
#* Clarify a gap between perception and reality.
#* Enable the reader to digest large amounts of information.
#Three possible hypotheses so far:
#1. Low density levels + low fixed.acidity + higher ph levels (over 3?) = higher alcohol content = better quality wine
#2. higher sulphate levels = better the wine
#3. lower volatile acidity = better the wine
#I'm less confident about the second two. clearly H1 is the more complicated!
#I'm also not 100% sure that pH has a significant effect...
#Plot One: Alcohol, Quality and related values - following the trail
#Explanation: There is a clear positive trend between Alcohol Level and Quality (Pearson's R: 0.479)
#There is a clear negative correlation between Density and Alcohol (Pearson's R: -0.496)
#There is a clear positive correlation between Density and Fixed Acidity (Pearson's R: 0.675)
#There is a clear negative correlation between Fixed Acidity and pH (Pearson's R: -0.706)
#Possible conclusion low density + low fixed acidity + high ph = high alcohol = better wine?
#Here's the graphic to back me up...
#Plot One: alcohol against density, coloured by quality, drawn for the whole
#dataset and for the low fixed acidity / higher pH subset.
#Row counts in the titles are taken from the data, not typed in by hand.
p1 <- ggplot(aes(y = alcohol, x = density, color = quality), data = rwine) +
  geom_jitter(alpha = 1/2) +
  scale_colour_gradient(low = I('yellow'), high = I('purple'),
                        guide = "legend") +
  stat_smooth(method = 'lm') +
  ggtitle(paste0("Alcohol v Density - entire dataset (",
                 nrow(rwine), " rows)"))
#Both conditions must be combined with `&` in a single filter expression.
#The third positional argument of subset() is `select` (which columns to
#keep), so subset(rwine, fixed.acidity < 9, pH > 3) never filtered on pH.
rwine2 <- subset(rwine, fixed.acidity < 9 & pH > 3)
p2 <- ggplot(aes(y = alcohol, x = density, color = quality), data = rwine2) +
  geom_jitter(alpha = 1/2) +
  scale_colour_gradient(low = I('yellow'), high = I('purple'),
                        guide = "legend") +
  stat_smooth(method = 'lm') +
  ggtitle(paste0("Alcohol v Density - where Fixed.Acidity < 9 and pH > 3 (",
                 nrow(rwine2), " rows)"))
#Only the two plots built here are arranged; p3 is a leftover from the
#volatile acidity section and does not belong in this figure.
grid.arrange(p1, p2)

#Conclusion: the graphs show a clear link between alcohol content, density and wine quality
#Factoring in a lower Fixed acidity level and a higher pH level makes the regression stronger.
#Plot Two: Sulphate and Quality
#Plot Two: sulphates per quality score (horizontal jitter only) with the
#per-score mean (dashed red) and the 10th / 90th percentile lines; the y-axis
#is capped at the 99th percentile of sulphates.
ggplot(data = rwine, mapping = aes(x = quality, y = sulphates)) +
  geom_point(position = position_jitter(height = 0), alpha = 1/5) +
  geom_line(stat = 'summary', fun.y = mean, color = 'red', linetype = 2) +
  geom_line(stat = 'summary', fun.y = quantile,
            fun.args = list(probs = .1)) +
  geom_line(stat = 'summary', fun.y = quantile,
            fun.args = list(probs = .9)) +
  ylim(0, quantile(rwine$sulphates, 0.99)) +
  ggtitle("Sulphate v Quality - 16 outliers removed")
## Warning: Removed 16 rows containing non-finite values (stat_summary).
## Warning: Removed 16 rows containing non-finite values (stat_summary).
## Warning: Removed 16 rows containing non-finite values (stat_summary).
## Warning: Removed 16 rows containing missing values (geom_point).

#Conclusion: There is a link between Sulphates and Quality which is worth further statistical analysis
#Plot Three: Volatile Acidity and Quality
#Plot Three: volatile acidity per quality score (horizontal jitter only) with
#the per-score mean (dashed red) and the 10th / 90th percentile lines.
#The y-axis cap is the 99th percentile of volatile acidity itself; it was
#previously taken from the sulphates column by mistake.
va_cap <- quantile(rwine$volatile.acidity, 0.99)
#Number of wines above the cap, reported in the title instead of a typed-in count
va_trimmed <- sum(rwine$volatile.acidity > va_cap)
ggplot(aes(x = quality, y = volatile.acidity), data = rwine) +
  geom_point(alpha = 1/5, position = position_jitter(height = 0)) +
  geom_line(stat = 'summary', fun.y = mean, linetype = 2, color = 'red') +
  geom_line(stat = 'summary', fun.y = quantile,
            fun.args = list(probs = .1)) +
  geom_line(stat = 'summary', fun.y = quantile,
            fun.args = list(probs = .9)) +
  ylim(0, va_cap) +
  ggtitle(paste("Volatile Acidity v Quality -", va_trimmed,
                "outliers removed"))
## Warning: Removed 15 rows containing non-finite values (stat_summary).
## Warning: Removed 15 rows containing non-finite values (stat_summary).
## Warning: Removed 15 rows containing non-finite values (stat_summary).
## Warning: Removed 15 rows containing missing values (geom_point).

#Conclusion: There is a link between Volatile Acidity and Quality which is worth further statistical analysis
#Overall Conclusion:
#The next step would be a more in-depth statistical analysis on whether the following factors contribute to a better quality wine:
#high alcohol level (over 10)
#low density (under 0.998)
#low fixed acidity (under 9)
#higher ph (over 3)
#low volatile acidity (under 0.75)
#higher sulphate levels (over 0.5)
#a more solid analysis could be made if we had access to data from different years.
#it is also worth bearing in mind that the 'quality' is a sensory score given by 3 professionals.
#if the same professionals could be involved each year that would make the analysis more robust.